# Title: Measuring Interethnic Marriage in Africa
# Author: Branden Bohrnsen, bohrnsen@umich.edu
# Date: 1/30/2026
# Description: Validate census-based estimates using DHS data.

library(pacman)
p_load(tidyverse, haven, here, patchwork, ggpubr, knitr)
options(scipen = 999)
set.seed(07151129)
here()

dhs_1996 <- read_dta(here("data", "dhs", "Zambia1996", "ZMCR31DT", "ZMCR31FL.DTA"))
dhs_2001 <- read_dta(here("data", "dhs", "Zambia2001-02", "ZMCR42DT", "ZMCR42FL.DTA"))
dhs_2013 <- read_dta(here("data", "dhs", "Zambia2013-14", "Couples' Recode", "ZMCR61FL.DTA"))

dhs_1996 <- dhs_1996 %>%
    select(v024, v025, v012, mv012, v511, mv511, v508, mv508, v131, mv131)

dhs_1996 <- dhs_1996 %>%
    mutate(
        year = 1996,
        v131 = haven::as_factor(v131),
        mv131 = haven::as_factor(mv131),
        region = haven::as_factor(v024),
        urban_rural = haven::as_factor(v025)
    ) %>%
    rename(
        age_woman = v012,
        age_man = mv012,
        age_at_first_marriage_woman = v511,
        age_at_first_marriage_man = mv511,
        year_of_first_marriage_woman = v508,
        year_of_first_marriage_man = mv508,
        ethnicity_woman = v131,
        ethnicity_man = mv131
    ) %>%
    select(-v024, -v025)

attr(dhs_1996$age_man, "labels") <- attr(dhs_1996$age_woman, "labels")
attr(dhs_1996$age_at_first_marriage_man, "labels") <- attr(dhs_1996$age_at_first_marriage_woman, "labels")
attr(dhs_1996$year_of_first_marriage_man, "labels") <- attr(dhs_1996$year_of_first_marriage_woman, "labels")
levels(dhs_1996$ethnicity_man) <- levels(dhs_1996$ethnicity_woman)

dhs_2001 <- dhs_2001 %>%
    select(v024, v025, v012, mv012, v511, mv511, v508, mv508, v131, mv131)

dhs_2001 <- dhs_2001 %>%
    mutate(
        year = 2001,
        v131 = haven::as_factor(v131),
        mv131 = haven::as_factor(mv131),
        region = haven::as_factor(v024),
        urban_rural = haven::as_factor(v025)
    ) %>%
    rename(
        age_woman = v012,
        age_man = mv012,
        age_at_first_marriage_woman = v511,
        age_at_first_marriage_man = mv511,
        year_of_first_marriage_woman = v508,
        year_of_first_marriage_man = mv508,
        ethnicity_woman = v131,
        ethnicity_man = mv131
    ) %>%
    select(-v024, -v025)

attr(dhs_2001$age_man, "labels") <- attr(dhs_2001$age_woman, "labels")
attr(dhs_2001$age_at_first_marriage_man, "labels") <- attr(dhs_2001$age_at_first_marriage_woman, "labels")
attr(dhs_2001$year_of_first_marriage_man, "labels") <- attr(dhs_2001$year_of_first_marriage_woman, "labels")
levels(dhs_2001$ethnicity_man) <- levels(dhs_2001$ethnicity_woman)

dhs_2013 <- dhs_2013 %>%
    select(v024, v025, v012, mv012, v511, mv511, v508, mv508, v131, mv131)

dhs_2013 <- dhs_2013 %>%
    mutate(
        year = 2013,
        v131 = haven::as_factor(v131),
        mv131 = haven::as_factor(mv131),
        region = haven::as_factor(v024),
        urban_rural = haven::as_factor(v025)
    ) %>%
    rename(
        age_woman = v012,
        age_man = mv012,
        age_at_first_marriage_woman = v511,
        age_at_first_marriage_man = mv511,
        year_of_first_marriage_woman = v508,
        year_of_first_marriage_man = mv508,
        ethnicity_woman = v131,
        ethnicity_man = mv131
    ) %>%
    select(-v024, -v025)

attr(dhs_2013$age_man, "labels") <- attr(dhs_2013$age_woman, "labels")
attr(dhs_2013$age_at_first_marriage_man, "labels") <- attr(dhs_2013$age_at_first_marriage_woman, "labels")
attr(dhs_2013$year_of_first_marriage_man, "labels") <- attr(dhs_2013$year_of_first_marriage_woman, "labels")
levels(dhs_2013$ethnicity_man) <- levels(dhs_2013$ethnicity_woman)

dhs <- bind_rows(
    dhs_1996,
    dhs_2001,
    dhs_2013
)

dhs <- dhs %>%
    mutate(
        year_of_first_marriage_woman = if_else(year_of_first_marriage_woman < 100, 1900 + year_of_first_marriage_woman, year_of_first_marriage_woman),
        year_of_first_marriage_man = if_else(year_of_first_marriage_man < 100, 1900 + year_of_first_marriage_man, year_of_first_marriage_man)
    )

dhs$Intergroup <- as.integer(dhs$ethnicity_woman != dhs$ethnicity_man)

dhs$decade_of_first_marriage <- cut(dhs$year_of_first_marriage_woman,
                                    breaks = seq(1921, 2021, by = 10),
                                    right = FALSE,
                                    labels = c("1920s", "1930s", "1940s", "1950s", "1960s",
                                              "1970s", "1980s", "1990s", "2000s", "2010s"))

dhs <- dhs %>%
    mutate(ethnic_block = case_when(
        # Bemba Block
        ethnicity_woman %in% c("bemba", "lunda (luapula)", "lala", "bisa", "ushi", "chishinga",
                    "ngumbo", "lamba", "kabende", "tabwa", "swaka", "swawka",
                    "mukulu", "mukulo", "ambo", "lima", "shila", "unga", "bwile",
                    "luano", "lungu", "mambwe", "namwanga", "senga", "tambo") ~ "Bemba",

        # Tonga Block
        ethnicity_woman %in% c("tonga", "lenje", "soli", "ila", "toka-leya", "sala") ~ "Tonga",

        # Northwestern Block
        ethnicity_woman %in% c("luvale", "lunda (north-western)", "lunda (northwestern)", "mbunda",
                    "luchazi", "ndembu", "mbowe", "chokwe", "kaonde", "kaonde sub-group",
                    "kaonde subgroup", "luyana", "luyana subgroup", "luana sub-group") ~ "Northwestern",

        # Lozi Block
        ethnicity_woman %in% c("lozi", "totela", "subiya", "nkoya", "mashasha", "mashi", "kwangwa",
            "kwanga", "nyengo", "kwandi", "mwenyi", "nwenyi", "simaa",
            "imilangu", "koma", "wina") ~ "Lozi",

        # Nyanja Block
        ethnicity_woman %in% c("chewa", "nsenga", "ngoni", "nyanja", "kunda", "chikunda",
                    "chicunda", "tumbuka", "gowa", "yombe") ~ "Nyanja",

        # Default catch-all for non-Zambian, DK, missing, etc.
        TRUE ~ "Other"
    )) %>%
    mutate(ethnic_block_male = case_when(
        # Bemba Block
        ethnicity_man %in% c("bemba", "lunda (luapula)", "lala", "bisa", "ushi", "chishinga",
                    "ngumbo", "lamba", "kabende", "tabwa", "swaka", "swawka",
                    "mukulu", "mukulo", "ambo", "lima", "shila", "unga", "bwile",
                    "luano", "lungu", "mambwe", "namwanga", "senga", "tambo") ~ "Bemba",

        # Tonga Block
        ethnicity_man %in% c("tonga", "lenje", "soli", "ila", "toka-leya", "sala") ~ "Tonga",

        # Northwestern Block
        ethnicity_man %in% c("luvale", "lunda (north-western)", "lunda (northwestern)", "mbunda",
                    "luchazi", "ndembu", "mbowe", "chokwe", "kaonde", "kaonde sub-group",
                    "kaonde subgroup", "luyana", "luyana subgroup", "luana sub-group") ~ "Northwestern",

        # Lozi Block
        ethnicity_man %in% c("lozi", "totela", "subiya", "nkoya", "mashasha", "mashi", "kwangwa",
            "kwanga", "nyengo", "kwandi", "mwenyi", "nwenyi", "simaa",
            "imilangu", "koma", "wina") ~ "Lozi",

        # Nyanja Block
        ethnicity_man %in% c("chewa", "nsenga", "ngoni", "nyanja", "kunda", "chikunda",
                    "chicunda", "tumbuka", "gowa", "yombe") ~ "Nyanja",

        # Default catch-all for non-Zambian, DK, missing, etc.
        TRUE ~ "Other"
    ))

dhs$Interblock <- as.integer(dhs$ethnic_block != dhs$ethnic_block_male)

dhs <- dhs %>% filter(ethnic_block != "Other", ethnic_block_male != "Other")

dhs %>%
    group_by(year) %>%
    reframe(
        mean(Intergroup, na.rm = T),
        sd(Intergroup, na.rm = T)
    ) %>% kable(format = "html")

dhs %>%
    reframe(
        mean(Intergroup, na.rm = T),
        sd(Intergroup, na.rm = T),
        n()
    ) %>% kable(format = "html")

dhs %>%
    group_by(decade_of_first_marriage) %>%
    reframe(
        mean(Intergroup, na.rm = T),
        sd(Intergroup, na.rm = T),
        mean(year_of_first_marriage_woman, na.rm = T)
    ) %>% kable(format = "html")

dhs %>%
    group_by(region) %>%
    reframe(
        mean(Intergroup, na.rm = T),
        sd(Intergroup, na.rm = T)
    ) %>% kable(format = "html")

dhs %>%
    group_by(year) %>%
    reframe(
        mean(Interblock, na.rm = T),
        sd(Interblock, na.rm = T)
    ) %>% kable(format = "html")

dhs %>%
    reframe(
        mean(Interblock, na.rm = T),
        sd(Interblock, na.rm = T)
    ) %>% kable(format = "html")

dhs %>%
    group_by(decade_of_first_marriage) %>%
    reframe(
        mean(Interblock, na.rm = T),
        sd(Interblock, na.rm = T),
        mean(year_of_first_marriage_woman, na.rm = T)
    ) %>% kable(format = "html")

dhs %>%
    group_by(region) %>%
    reframe(
        mean(Interblock, na.rm = T),
        sd(Interblock, na.rm = T)
    ) %>% kable(format = "html")

# Calculate HHI for ethnicities by decade married
ethnicity_hhi <- dhs %>%
  filter(!is.na(decade_of_first_marriage), !is.na(ethnicity_woman)) %>%
  group_by(decade_of_first_marriage, ethnicity_woman) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(
    total = sum(n),
    p = n / total,
    p_squared = p^2
  ) %>%
  summarise(
    HHI_Ethnicity = sum(p_squared),
    Diversity_Ethnicity = 1 - HHI_Ethnicity,
    N_Observations = unique(total),
    N_Groups = n()
  ) %>%
  mutate(decade_of_first_marriage = as.character(decade_of_first_marriage))

# Calculate HHI for ethnic blocks by decade married
block_hhi <- dhs %>%
  filter(!is.na(decade_of_first_marriage), !is.na(ethnic_block)) %>%
  group_by(decade_of_first_marriage, ethnic_block) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(
    total = sum(n),
    p = n / total,
    p_squared = p^2
  ) %>%
  summarise(
    HHI_Block = sum(p_squared),
    Diversity_Block = 1 - HHI_Block,
    N_Observations = unique(total),
    N_Groups = n()
  ) %>%
  mutate(decade_of_first_marriage = as.character(decade_of_first_marriage))

# Combine the results
diversity_by_decade <- ethnicity_hhi %>%
  full_join(block_hhi, by = c("decade_of_first_marriage", "N_Observations"),
            suffix = c("_Ethnicity", "_Block"))

# Display results
kable(diversity_by_decade,
      caption = "HHI and Diversity by Decade of First Marriage",
      digits = 4)

# Calculate exposure measures by province for ethnicities
ethnicity_exposure <- dhs %>%
  filter(!is.na(ethnicity_woman), !is.na(region)) %>%
  group_by(region, ethnicity_woman) %>%
  summarise(n_group = n(), .groups = "drop_last") %>%
  mutate(
    total_province = sum(n_group),
    prop_group = n_group / total_province,
    Exposure_Ethnicity = 1 - prop_group
  ) %>%
  select(region, ethnicity_woman, Exposure_Ethnicity)

# Calculate exposure measures by province for ethnic blocks
block_exposure <- dhs %>%
  filter(!is.na(ethnic_block), !is.na(region)) %>%
  group_by(region, ethnic_block) %>%
  summarise(n_group = n(), .groups = "drop_last") %>%
  mutate(
    total_province = sum(n_group),
    prop_group = n_group / total_province,
    Exposure_Block = 1 - prop_group
  ) %>%
  select(region, ethnic_block, Exposure_Block)

# Join exposure measures back to main dataset
dhs_with_exposure <- dhs %>%
  left_join(ethnicity_exposure, by = c("region", "ethnicity_woman")) %>%
  left_join(block_exposure, by = c("region", "ethnic_block"))

# Calculate ratios of observed to expected intermarriage
intermarriage_ratios <- dhs_with_exposure %>%
  filter(!is.na(Exposure_Ethnicity), !is.na(Exposure_Block),
         !is.na(decade_of_first_marriage)) %>%
  group_by(decade_of_first_marriage) %>%
  summarise(
    N = n(),

    # Ethnicity measures
    Observed_Intergroup = mean(Intergroup, na.rm = TRUE),
    Expected_Intergroup = mean(Exposure_Ethnicity, na.rm = TRUE),
    Ratio_Intergroup = mean(Intergroup / Exposure_Ethnicity, na.rm = TRUE),

    # Block measures
    Observed_Interblock = mean(Interblock, na.rm = TRUE),
    Expected_Interblock = mean(Exposure_Block, na.rm = TRUE),
    Ratio_Interblock = mean(Interblock / Exposure_Block, na.rm = TRUE),

    .groups = "drop"
  )

kable(intermarriage_ratios,
      caption = "Observed vs Expected Intermarriage by Decade (DHS Data)",
      digits = 4)

# Also calculate by region for comparison
intermarriage_by_region <- dhs_with_exposure %>%
  filter(!is.na(Exposure_Ethnicity), !is.na(Exposure_Block)) %>%
  group_by(region) %>%
  summarise(
    N = n(),

    # Ethnicity measures
    Observed_Intergroup = mean(Intergroup, na.rm = TRUE),
    Expected_Intergroup = mean(Exposure_Ethnicity, na.rm = TRUE),
    Ratio_Intergroup = mean(Intergroup / Exposure_Ethnicity, na.rm = TRUE),

    # Block measures
    Observed_Interblock = mean(Interblock, na.rm = TRUE),
    Expected_Interblock = mean(Exposure_Block, na.rm = TRUE),
    Ratio_Interblock = mean(Interblock / Exposure_Block, na.rm = TRUE),

    .groups = "drop"
  )

kable(intermarriage_by_region,
      caption = "Observed vs Expected Intermarriage by Region (DHS Data)",
      digits = 4)

# Also calculate by urban/rural for comparison
intermarriage_by_urban_rural <- dhs_with_exposure %>%
  filter(!is.na(Exposure_Ethnicity), !is.na(Exposure_Block), !is.na(urban_rural)) %>%
  group_by(urban_rural) %>%
  summarise(
    N = n(),

    # Ethnicity measures
    Observed_Intergroup = mean(Intergroup, na.rm = TRUE),
    Expected_Intergroup = mean(Exposure_Ethnicity, na.rm = TRUE),
    Ratio_Intergroup = mean(Intergroup / Exposure_Ethnicity, na.rm = TRUE),

    # Block measures
    Observed_Interblock = mean(Interblock, na.rm = TRUE),
    Expected_Interblock = mean(Exposure_Block, na.rm = TRUE),
    Ratio_Interblock = mean(Interblock / Exposure_Block, na.rm = TRUE),

    .groups = "drop"
  )

kable(intermarriage_by_urban_rural,
      caption = "Observed vs Expected Intermarriage by Urban/Rural (DHS Data)",
      digits = 4)

# Also calculate by decade and urban/rural for comparison
intermarriage_by_decade_urban_rural <- dhs_with_exposure %>%
  filter(!is.na(Exposure_Ethnicity), !is.na(Exposure_Block),
         !is.na(decade_of_first_marriage), !is.na(urban_rural)) %>%
  group_by(decade_of_first_marriage, urban_rural) %>%
  summarise(
    N = n(),

    # Ethnicity measures
    Observed_Intergroup = mean(Intergroup, na.rm = TRUE),
    Expected_Intergroup = mean(Exposure_Ethnicity, na.rm = TRUE),
    Ratio_Intergroup = mean(Intergroup / Exposure_Ethnicity, na.rm = TRUE),

    # Block measures
    Observed_Interblock = mean(Interblock, na.rm = TRUE),
    Expected_Interblock = mean(Exposure_Block, na.rm = TRUE),
    Ratio_Interblock = mean(Interblock / Exposure_Block, na.rm = TRUE),

    .groups = "drop"
  )

kable(intermarriage_by_decade_urban_rural,
      caption = "Observed vs Expected Intermarriage by Decade and Urban/Rural (DHS Data)",
      digits = 4)

if (file.exists(here("data", "clean", "census.csv"))) {

  # Export Table 4 -------------------------------------
  
  # Load Census Data
  basic_summary <- read_csv(here("out", "final", "basic_summary.csv"))
  basic_geo <- read_csv(here("out", "final", "basic_by_geography.csv"))
  elf_overall <- read_csv(here("out", "final", "elf_overall.csv"))
  elf_geo <- read_csv(here("out", "final", "elf_urban_rural.csv"))
  marriages_overall <- read_csv(here("out", "final", "marriages_overall.csv"))
  marriages_geo <- read_csv(here("out", "final", "marriages_urban_rural.csv"))
  
  # Function to extract value for a specific level and metric
  get_val <- function(data, group_type, level_val, col_name, level_col = "Level") {
    if (level_val == "National") {
      res <- data %>% filter(Group_Type == group_type) %>% pull(!!sym(col_name))
    } else {
      # Handle Urban/Rural boolean or string
      if ("Urban" %in% names(data)) {
         is_urban <- level_val == "Urban"
         res <- data %>% filter(Group_Type == group_type, Urban == is_urban) %>% pull(!!sym(col_name))
      } else {
         res <- data %>% filter(Group_Type == group_type, !!sym(level_col) == level_val) %>% pull(!!sym(col_name))
      }
    }
    return(res)
  }
  
  # 1. DHS Observed (Block)
  dhs_nat <- mean(dhs$Interblock, na.rm = TRUE)
  dhs_urb <- intermarriage_by_urban_rural %>% filter(urban_rural == "urban") %>% pull(Observed_Interblock)
  dhs_rur <- intermarriage_by_urban_rural %>% filter(urban_rural == "rural") %>% pull(Observed_Interblock)
  
  # 2. Census Observed (Block)
  cen_obs_nat <- get_val(basic_summary, "Ethnic Block", "National", "Basic_Observed")
  cen_obs_urb <- get_val(basic_geo, "Ethnic Block", "Urban", "observed_rate")
  cen_obs_rur <- get_val(basic_geo, "Ethnic Block", "Rural", "observed_rate")
  
  # 3. Census Adjusted Country ELF (Block)
  # Basic_Adjusted = Observed / ELF (National) -> This is I_c (National)
  cen_adj_nat_nat <- get_val(basic_summary, "Ethnic Block", "National", "Basic_Adjusted")
  cen_adj_nat_urb <- get_val(basic_geo, "Ethnic Block", "Urban", "I_basic")
  cen_adj_nat_rur <- get_val(basic_geo, "Ethnic Block", "Rural", "I_basic")
  
  # 4. Census Adjusted Const ELF (Block) - Eq 1
  cen_adj_const_nat <- get_val(elf_overall, "Ethnic Block", "National", "I_community_diversity_national")
  cen_adj_const_urb <- get_val(elf_geo, "Ethnic Block", "Urban", "I_community_diversity_national")
  cen_adj_const_rur <- get_val(elf_geo, "Ethnic Block", "Rural", "I_community_diversity_national")
  
  # 5. Census Adjusted Exposure (Block) - Eq 2
  cen_adj_exp_nat <- get_val(marriages_overall, "Ethnic Block", "National", "I_group_share_national")
  cen_adj_exp_urb <- get_val(marriages_geo, "Ethnic Block", "Urban", "I_group_share_national")
  cen_adj_exp_rur <- get_val(marriages_geo, "Ethnic Block", "Rural", "I_group_share_national")
  
  # 6. Sample Sizes - Extract from each source file separately (different N due to NA handling)
  n_dhs <- nrow(dhs) # Total DHS marriages
  
  # Census_Observed and Census_Adj_Country_ELF use the same sample (basic_geo)
  n_census_observed <- basic_geo %>% 
    dplyr::filter(Group_Type == "Ethnic Block", Level == "National") %>%
    pull(n_marriages)
  
  # Census_Adj_Const_ELF uses elf_overall sample
  n_census_const_elf <- elf_overall %>%
    dplyr::filter(Group_Type == "Ethnic Block") %>%
    pull(total_n_marriages)
  
  # Census_Adj_Exposure uses marriages_overall sample
  n_census_exposure <- marriages_overall %>%
    dplyr::filter(Group_Type == "Ethnic Block") %>%
    pull(total_n_marriages)
  
  # Construct Table
  table4_full <- tibble(
    Level = c("Zambia", "Urban Zambia", "Rural Zambia"),
    DHS_Observed = c(dhs_nat, dhs_urb, dhs_rur),
    Census_Observed = c(cen_obs_nat, cen_obs_urb, cen_obs_rur),
    Census_Adj_Country_ELF = c(cen_adj_nat_nat, cen_adj_nat_urb, cen_adj_nat_rur),
    Census_Adj_Const_ELF = c(cen_adj_const_nat, cen_adj_const_urb, cen_adj_const_rur),
    Census_Adj_Exposure = c(cen_adj_exp_nat, cen_adj_exp_urb, cen_adj_exp_rur)
  )
  
  # Add Sample Size Row - each column uses its own N (different due to NA handling in each estimation)
  n_row <- tibble(
    Level = "Sample Size",
    DHS_Observed = n_dhs,
    Census_Observed = n_census_observed,
    Census_Adj_Country_ELF = n_census_observed,  # Uses same sample as Census_Observed
    Census_Adj_Const_ELF = n_census_const_elf,
    Census_Adj_Exposure = n_census_exposure
  )
  
  table4_full <- bind_rows(table4_full, n_row)

  if (!dir.exists(here("figures"))) {
    dir.create(here("figures"), recursive = TRUE)
  }
  
  write_csv(table4_full, here("out", "final", "Table_4_Replication_Full.csv"))
  write_csv(table4_full, here("figures", "Table_4_Replication_Full.csv"))
  cat("\nSaved Full Table 4 to out/final/Table_4_Replication_Full.csv\n")

} else {
  cat("\nCensus data not found. Saving DHS estimates only.\n")

  if (!dir.exists(here("figures"))) {
    dir.create(here("figures"), recursive = TRUE)
  }

  write_csv(intermarriage_ratios, here("out", "final", "Table_4_DHS_Estimates_Only.csv"))
  write_csv(intermarriage_ratios, here("figures", "Table_4_DHS_Estimates_Only.csv"))
}

